package com.kdtech.analyse.news;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import com.kdtech.analyse.AnalyseNews;
import com.kdtech.analyse.JSoupUtils;
import com.kdtech.crawler.CrawlHTML;
import com.kdtech.entity.crawler.UrlMeta;
import com.kdtech.entity.data.NewsMeta;
import com.kdtech.utils.DateUtils;
import com.kdtech.utils.HtmlCleaner;

/**
 * News page parser for the inhe.net family of sites (www, house, life, auto,
 * jiaju, it, marry and weather sub-domains).
 * http://weather.inhe.net/ is the Shijiazhuang Meteorological Bureau site.
 *
 * Original author: xiaonie, 2012-11-22
 */
public class InheNewsAnalyse implements AnalyseNews{
	/**
	 * URL patterns of article (detail) pages, compiled once instead of on
	 * every {@link #isDetailPage(String)} call. A URL must match one of
	 * these in full to count as a detail page.
	 */
	private static final Pattern[] DETAIL_PAGE_PATTERNS={
			Pattern.compile("http://www.inhe.net/news/.*?[0-9]{6}/t[0-9]{8}_[0-9]+.htm"),
			Pattern.compile("http://house.inhe.net/news/[0-9]{4}/[0-9]{4}/[0-9]+.shtml"),
			Pattern.compile("http://life.inhe.net/article.php[?]itemid=[0-9]+"),
			Pattern.compile("http://auto.inhe.net/[0-9]{4}/[0-9]{4}/[0-9]+.html"),
			Pattern.compile("http://jiaju.inhe.net/jiancai/.*?[0-9]{6}/t[0-9]{8}_[0-9]+.htm"),
			Pattern.compile("http://it.inhe.net/[0-9]*/.*.shtml"),
			Pattern.compile("http://marry.inhe.net/content-[0-9]{2}-[0-9]{4}-[0-9]{1}.html"),
			Pattern.compile("http://weather.inhe.net/shownews.asp[?]id=[0-9]*"),
			Pattern.compile("http://www.inhe.net/news/show-[0-9]+.html"),
			};


	/**
	 * Tells whether the given URL is an article page this parser knows.
	 *
	 * @param url the URL to test; {@code null} is treated as "not a detail page"
	 * @return {@code true} if the whole URL matches one of the known patterns
	 */
	public boolean isDetailPage(String url) {
		if (url == null) {
			return false;
		}
		for (Pattern pattern : DETAIL_PAGE_PATTERNS) {
			if (pattern.matcher(url).matches()) {
				return true;
			}
		}
		return false;
	}


	/**
	 * Extracts title, content, date and author from a crawled page.
	 *
	 * The sub-domain of the URL selects a set of site-specific CSS selectors;
	 * anything those selectors fail to produce is then filled in by generic
	 * fallbacks (first {@code h1}, {@code div#cookieContent}, a date found in
	 * the URL, a date found in any {@code span}).
	 *
	 * @param urlMeta the crawled page; must not be {@code null}, and its URL
	 *                must not be {@code null} when it carries HTML
	 * @return a new {@link NewsMeta}; when the page has no HTML (null or
	 *         empty) it is returned without any field populated
	 */
	public NewsMeta parserHtml(UrlMeta urlMeta) {
		NewsMeta news=new NewsMeta();
		String htmltxt=urlMeta.getHtml();
		// Nothing to parse: bail out before handing a null/empty string to jsoup.
		if (StringUtils.isEmpty(htmltxt)) {
			return news;
		}
		String url=urlMeta.getUrl();
		Document doc=Jsoup.parse(htmltxt);
		String title=null;
		String content=null;
		Long date=null;

		news.setUrl(url);
		if(url.startsWith("http://jiaju.inhe.net/")){
			title=doc.select("h1").text();
			content=HtmlCleaner.getContentHtml(url,doc.select("div.font_14.ptb1"));
			// first() yields null when the page has no <h2>; the date
			// fallbacks below take over in that case.
			Element dateElement=doc.select("h2").first();
			if(dateElement!=null){
				date=DateUtils.matchDate(dateElement.toString());
			}
		}else if(url.startsWith("http://marry.inhe.net/") || url.startsWith("http://life.inhe.net/")){
			Element titleElement=doc.select("div.item-t").first();
			// Without the title container the page layout is not the expected
			// one, so the site-specific selectors are skipped entirely.
			if(titleElement!=null){
				try {
					title=titleElement.text();
					content=HtmlCleaner.getContentHtml(url,doc.select("span#content"));
					date=DateUtils.matchDate(doc.select("div.items").text());
				} catch (RuntimeException ignored) {
					// Best effort: whatever could not be extracted here is
					// retried by the generic fallbacks below.
				}
			}
		}else if(url.startsWith("http://auto.inhe.net/")){
			title=doc.select("h1").text();
			content=HtmlCleaner.getContentHtml(url,doc.select("div#contentTxt"));
			date=DateUtils.matchDate(doc.select("h3").text());
		}else if(url.startsWith("http://house.inhe.net/")){
			title=doc.select("div.articletitle").text();
			content=HtmlCleaner.getContentHtml(url,doc.select("div#ctrlfscont"));
			date=DateUtils.matchDate(doc.select("div.titBar > div.fr > div.fl").text());
		}else if(url.startsWith("http://it.inhe.net/")){
			title=doc.select("h1").text();
			content=HtmlCleaner.getContentHtml(url,doc.select("div.nry_zw"));
			date=DateUtils.matchDate(doc.select("div.nry_wxcc").text());
		}else if (url.startsWith("http://weather.inhe.net/")) {
			title=doc.select("html body table tbody tr td table.bk tbody tr td div.STYLE9").text();
			content=HtmlCleaner.getContentHtml(url,doc.select("html body table tbody tr td table.bk tbody tr:eq(2)"));
			date=DateUtils.matchDate(doc.select("html body table tbody tr td table.bk tbody tr td div.a1 span.STYLE12").text());
		}

		// Generic fallbacks for anything the site-specific selectors missed.
		if(StringUtils.isBlank(title)){
			title=doc.select("h1").text();
		}
		if(StringUtils.isBlank(content)){
			content=HtmlCleaner.getContentHtml(url,doc.select("div#cookieContent"));
		}
		if(StringUtils.isBlank(content)){
			// NOTE(review): this selector targets one specific element id;
			// presumably a single forum post - confirm whether it should be
			// generalized.
			content=HtmlCleaner.getContentHtml(url,doc.select("td#postmessage_6626073.t_f"));
		}
		if(date==null){
			date=DateUtils.matchDate(url);
		}
		if(date==null){
			date=DateUtils.matchDate(doc.select("span").text());
		}

		String author=JSoupUtils.matchAuthor(doc, "来源：");

		news.setContent(content);
		news.setTitle(title);
		news.setClickNum(null);
		news.setCommentNum(null);
		news.setDate(date);
		news.setType(1);
		news.setAuthor(author);
		return news;
	}

	/**
	 * Manual smoke test: crawls one fixed URL, prints whether it is a detail
	 * page and prints the parsed result.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		InheNewsAnalyse a=new InheNewsAnalyse();
		String url="http://bbs.inhe.net/thread-2654667-1-1.html";
		UrlMeta meta=CrawlHTML.responseToURL(url);
		System.out.println(a.isDetailPage(url));
		NewsMeta parserHtml=a.parserHtml(meta);
		System.out.println(parserHtml);
	}


	/**
	 * Not implemented for this site.
	 *
	 * @param meta ignored
	 * @return always {@code null}
	 */
	public NewsMeta Update(NewsMeta meta) {
		return null;
	}


	/**
	 * @return always {@code false}
	 */
	public boolean isNeedUpdate(){
		return false;
	}
}
